#!/bin/bash
# Verify that every external program this script invokes is available on PATH.
#
# Checked tools:
#   pdftotext - extracts the embedded text layer used for word counting
#   convert   - ImageMagick, rasterises the PDF to a TIFF for OCR
#   tesseract - performs the OCR itself
#
# Outputs: a diagnostic on stderr naming the first missing tool.
# Exits:   terminates the script with status 1 if any tool is missing.
function check_dependencies {
	command -v pdftotext >/dev/null 2>&1 || { echo >&2 "No 'pdftotext' installed. On most Linux distributions, pdftotext is part of 'poppler-utils' package."; exit 1; }
	command -v convert >/dev/null 2>&1 || { echo >&2 "No 'convert' installed. Install 'imagemagick' package."; exit 1; }
	command -v tesseract >/dev/null 2>&1 || { echo >&2 "No 'tesseract' installed. Install 'tesseract-ocr' package."; exit 1; }
}

# Count the words pdftotext can extract from the PDF named by $PDFFILE.
#
# Globals:   PDFFILE (read) - path of the PDF to inspect. Positional
#            arguments are ignored; callers in this file pass the path,
#            but the global is the single source of truth.
# Outputs:   the word count as a plain integer on stdout (0 when nothing
#            could be extracted).
# Returns:   pdftotext's exit status, or 1 if no temp file could be made.
function count_words_in_pdf {
	local tmp_txt word_count status=0

	# Use a private, unpredictable temp file rather than a fixed name in
	# /tmp, so parallel runs and pre-existing files cannot interfere.
	tmp_txt=$(mktemp "${TMPDIR:-/tmp}/pdfwords.XXXXXX") || {
		echo >&2 "Could not create temporary file."
		return 1
	}

	pdftotext "$PDFFILE" "$tmp_txt" || status=$?
	word_count=$(wc -w < "$tmp_txt")
	rm -f -- "$tmp_txt"

	# Arithmetic expansion strips any padding wc adds and turns an empty
	# result into 0, so callers always receive a number.
	echo "$((word_count))"
	return "$status"
}

# --- Main script -----------------------------------------------------------
# Usage: <script> <INPUT PDF>
#
# If pdftotext finds fewer than MIN_WORDS words in the input, the PDF is
# rasterised with ImageMagick and run through Tesseract, which writes a
# searchable PDF to "<dir>/<name-without-.pdf>.pdf" (for an input ending in
# ".pdf" that is the input file itself).

PDFFILE=$1

# Tesseract language code (see man tesseract > LANGUAGES). Deliberately NOT
# stored in LANG: that is the locale environment variable, and overwriting
# it would hand an invalid locale to every child process.
OCR_LANG=${OCR_LANG:-deu}
MIN_WORDS=5	# Number of words required to accept pdftotext result.

if [[ -z "$PDFFILE" ]]; then
	echo "syntax: $(basename -- "$0") <INPUT PDF>" >&2
	exit 1
fi

if [[ ! -f "$PDFFILE" || ! -r "$PDFFILE" ]]; then
	echo >&2 "Cannot read input file '$PDFFILE'."
	exit 1
fi

check_dependencies

# Check if pdf already has embedded text.
WORDCOUNT=$(count_words_in_pdf "$PDFFILE")

# If that fails, try Tesseract. An empty count is treated as 0.
if [[ "${WORDCOUNT:-0}" -lt "$MIN_WORDS" ]]; then
	echo "Attempting OCR extraction..."

	# Private scratch directory, removed on every exit path.
	TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/pdfocr.XXXXXX") || {
		echo >&2 "Could not create temporary directory."
		exit 1
	}
	trap 'rm -rf -- "$TMP_DIR"' EXIT
	TIFF_FILE="$TMP_DIR/pages.tiff"

	# Use imagemagick to convert the PDF to a high-res multi-page TIFF.
	if ! convert -density 300 "$PDFFILE" -depth 8 -strip -background white -alpha off "$TIFF_FILE"; then
		echo >&2 "Converting '$PDFFILE' to TIFF failed."
		exit 1
	fi

	# Output base name: input path with a trailing ".pdf" stripped;
	# the output gets its extension from Tesseract's "pdf" config.
	FILE_BASENAME="$(dirname -- "$PDFFILE")/$(basename -- "$PDFFILE" .pdf)"

	# Then use Tesseract to perform OCR on the tiff.
	if ! tesseract "$TIFF_FILE" "$FILE_BASENAME" -l "$OCR_LANG" pdf; then
		echo >&2 "Tesseract OCR failed."
		exit 1
	fi

	# The intermediate TIFF is discarded by the EXIT trap above.

	WORDCOUNT=$(count_words_in_pdf "$PDFFILE")

	echo "$WORDCOUNT words in new sandwiched pdf \O/"
else
	echo "pdftotext extracted $WORDCOUNT words. Skip OCR process."
fi
